
#include "UTF8Tokenizer.h"
#include <iostream>

// Builds a token from its textual content and its type tag.
//   iContent - text carried by the token (copied into mContent)
//   iType    - classification stored in mType
UTF8Token::UTF8Token( ::std::string iContent, TOKEN_TYPE iType )
    : mContent( iContent )
    , mType( iType )
{}

// Empty destructor body: no explicit cleanup is performed here.
UTF8Token::~UTF8Token() {}

::std::string UTF8Token::getContent()
{
    return mContent;
}

// Returns the type tag this token was constructed with.
TOKEN_TYPE UTF8Token::getType()
{
    return this->mType;
}

// Creates a tokenizer that reads from iInputStream.
// mInputStream is initialized from the given stream and mStringBuffer is
// value-initialized.
// NOTE(review): if mInputStream is a reference member (declared in the
// header, not visible here), the caller must keep the stream alive for as
// long as the tokenizer is used - confirm.
UTF8Tokenizer::UTF8Tokenizer( ::std::istream& iInputStream )
    : mInputStream( iInputStream )
    , mStringBuffer()
{}

// Empty destructor body: no explicit cleanup is performed here.
UTF8Tokenizer::~UTF8Tokenizer() {}

// True for the ASCII letters 'a'-'z' and 'A'-'Z'; any other byte ends a word.
static bool isAsciiLetter( char iChar )
{
	return (iChar >= 'a' && iChar <= 'z') || (iChar >= 'A' && iChar <= 'Z');
}

// Reads the next token from the input stream.
//
// Leading blanks are skipped first. The result is one of:
//   - ("EOF", TOKEN_EOF) when no further byte can be read;
//   - a TOKEN_WORD holding a maximal run of ASCII letters;
//   - a TOKEN_UNKNOWN holding one other single-byte character, or the bytes
//     of one multi-byte sequence (lead byte plus 1 to 3 following bytes);
//   - ("Unvalid UTF-8 Source", TOKEN_EOF) when the lead byte matches none of
//     the lead-byte tests below, or the stream ends inside a multi-byte
//     sequence.
//
// Following bytes of a multi-byte sequence are copied as-is; they are not
// checked against the 10****** continuation pattern.
UTF8Token UTF8Tokenizer::nextToken()
{
	typedef ::std::istream::traits_type Traits;

	skipBlank();
	if( mInputStream.eof() )
		return UTF8Token( "EOF", TOKEN_EOF );
	// clear the state
	TOKEN_TYPE tType = TOKEN_UNKNOWN;
	mStringBuffer.str("");

	// Keep the int_type result of get(): narrowing it to char first would
	// turn the end-of-input marker into byte 0xFF and hide it. This also
	// covers a stream that has failed without eofbit being set.
	const Traits::int_type tFirst = mInputStream.get();
	if( Traits::eq_int_type( tFirst, Traits::eof() ) )
		return UTF8Token( "EOF", TOKEN_EOF );

	const char tc = Traits::to_char_type( tFirst );
	mStringBuffer << tc;

	// Number of bytes that must follow the lead byte.
	int tFollowing = 0;

	// 11110*** 10****** 10****** 10******
	if( (tc & UTF8_MASK4BYTES) == UTF8_MASK4BYTES )
	{
		tFollowing = 3;
	}
	// 1110**** 10****** 10******
	else if( (tc & UTF8_MASK3BYTES) == UTF8_MASK3BYTES )
	{
		tFollowing = 2;
	}
	// 110***** 10******
	else if( (tc & UTF8_MASK2BYTES) == UTF8_MASK2BYTES )
	{
		tFollowing = 1;
	}
	// 0*******
	else if( ! (tc & UTF8_MASK1BYTE ) )
	{
		if( isAsciiLetter( tc ) )
		{
			tType = TOKEN_WORD;
			// Look ahead with peek() and consume only confirmed letters.
			// Unlike get()+unget(), this never calls unget() after a failed
			// read, so eofbit stays set once the end of input is reached.
			for( ;; )
			{
				const Traits::int_type tNext = mInputStream.peek();
				if( Traits::eq_int_type( tNext, Traits::eof() ) )
					break;
				const char tNextChar = Traits::to_char_type( tNext );
				if( !isAsciiLetter( tNextChar ) )
					break;
				mStringBuffer << tNextChar;
				mInputStream.get();
			}
		}
	}
	else
	{
		return UTF8Token( "Unvalid UTF-8 Source", TOKEN_EOF );
	}

	for( int i = 0; i < tFollowing; i++ )
	{
		const Traits::int_type tNext = mInputStream.get();
		// The stream ended in the middle of a multi-byte sequence: report
		// it as invalid input instead of padding the token with 0xFF bytes.
		if( Traits::eq_int_type( tNext, Traits::eof() ) )
			return UTF8Token( "Unvalid UTF-8 Source", TOKEN_EOF );
		mStringBuffer << Traits::to_char_type( tNext );
	}

	return UTF8Token( mStringBuffer.str(), tType );
}

// Consumes blanks (space, tab, carriage return, line feed) from the input
// stream and stops in front of the first other byte, or at end of input.
//
// The next byte is inspected with peek() and consumed only when it is a
// blank. Nothing is ever put back, so reaching the end of input leaves
// eofbit set for the caller's eof() check, and a real 0xFF byte is not
// confused with the end-of-input marker.
void UTF8Tokenizer::skipBlank()
{
	typedef ::std::istream::traits_type Traits;

	while( !mInputStream.eof() )
	{
		const Traits::int_type tNext = mInputStream.peek();
		if( Traits::eq_int_type( tNext, Traits::eof() ) )
			return;

		const char tc = Traits::to_char_type( tNext );
		if( tc != ' ' && tc != '\t' && tc != '\r' && tc != '\n' )
			return;

		// Confirmed blank: consume it and look at the next byte.
		mInputStream.get();
	}
}


